;TODO: if I don't have T in cx, I can do dithering: fadd bx, dword-store to cx:ax, read ax right away
;TODO: I don't need 196 (C4) nor -512.3. They're used only in 'N=pd-(ro|p[i])=pd-ro.z|p[i].z=196+512*p[i].z' which is better

; Vector3: X grows right, Y down, Z forward.
; On the FP stack and in memory it looks like {Y X Z} (sometimes I need only Y).

org 100h ; assume al=0 bx=0 sp=di=-2 si=0100h bp=09??h
C19  dw 19                 ;=19 00  adc ax,[bx+si]
C196 dw 196    ; ax=13h    ;=C4 00  les ax,[bx+si]
RO_Z equ $-1-4 ; ray_origin.z = about -512.3 (don't care about LSbyte)

%define S(x) [byte x + si-100h]

P:int 10h      ; video mode, default palette
  dec di
DI_ equ -3     ; pixel_adr@di = -3

;Each frame: the visible pixels are A0000..AF9FF, I want X=0 Y=0 in the center
M:mov dx,0xA000-10-20-20-4 ;=0x9fca
  mov es,dx    ; dx:bx=YX:XX = 0x9fca:0

;Generate gem normals to p0..p19=[bp+200h,300h,...].
  pusha  ; adr:   -18 -16 -14 -12 -10  -8  -6  -4  -2
         ; stack:  di  si  bp  sp  bx  dx  cx  ax   0
         ; data:   -2 100 9??  -2  0  9fca T  key
  mov cx,[si]
CCOLOR equ $-2
G:add bp,si    ; i@cx = 19...1; bp points to p[19-i]; carry=0
  pusha
  fninit       ; clear FP stack

  fldln2  ; IRREGULAR HEXAGONAL PYRAMID
N:fchs
  loop N             ;|z=0.693*(-1^i)
  fild word[-6-16+di-DI_]  ; pushed i
  fsincos            ;|y=cos(i) x=sin(i)   ; len=1.2167

;  fld1   ; CUBE
;  fldz    
;  fldz
;N:fchs
;  fstp st3
;  loop N

;  fldl2t  ; "CUBE"
;  fsincos
;  fldz
;N:fchs
;  fstp st3
;  loop N

;  fld1  ; DODECAHEDRON
;  fsincos
;  fldz
;N:test cl,1
;  jnz K
;  fchs
;K:fstp st3
;  loop N

;  fild word[-6+di-DI_]  ; CUBE->DODECA->RHOMIC_DODECA
;  fidiv word[si+2]
;  fsincos
;  fldz
;N:test cl,1
;  jnz K
;  fchs
;K:fstp st3
;  loop N

;  cmp cl,10 ; CUBE ELONGATED ALONG DIAGONAL
;  jb CB2
;  fldz    
;  fld1   
;  jmp CBE
;CB2:
;  fldln2
;  fldln2
;CBE:
;  fldz
;N:fchs
;  fstp st3
;  loop N
;  clc

;  test cl,4 ; INSTESECTION OF 2 CUBES
;  fld1   
;  fldz    
;  fldz
;N:fchs
;  fstp st3
;  loop N
;  jz STT

;Do a bunch of rotations. (It doesn't need to be fast.)

  mov cl,16      ; j@bx=0..15
;  popa          ; morphing shape (good with CUBE)
;  pusha
;STT:
;  add cl,3

R:fld st2            ;|z y x z                     ;|x sz y x cz
  fild word[-6+di-DI_]
  fidiv word[bx+si]  ;|t=T/[19,-15360,196,...][j]
  fsincos            ;|c=cos(t) s=sin(t) z y x z   ;|c s x sz y x cz
  fmulp st5          ;|s z y x cz                  ;|s x sz y cx cz
  fmulp              ;|sz y x cz                   ;|sx sz y cx cz
  cmc
BIG equ $-1 ;=1928710622
  jc R
  fsubp st4          ;|sz y cx cz-sx
  faddp st2          ;|y cx+sz cz-sx
  fstp st3           ;|new.x=cx+sz .y=cz-sx .z=y
  inc bx
  loop R       ; bx=16
  fstp dword[bp+si]
  fstp dword[bp+si+4]
  fstp dword[bp+si+8]
;CCOLOR equ $-2
  popa
  loop G
  popa

;Each pixel: cx=T dx:bx=YX:XX(init=9fca:0) di=adr(init=0)
X:inc dx       ; part of "dx:bx += 0x0000CCCD"
X2:
  pusha        ; adr:     -18 -16 -14 -12 -10  -8  -6  -4  -2
  fninit       ; stack:    di  si  bp  sp  bx  dx  cx  ax   0
  mov di,-4    ; s16:  pixadr 100 9??  -2  ..X..Y  T result

;Compute ray direction.
  fild word S(BIG)
  fild word[di+4-9]
  fild word[di+4-8]  ;|y=Y x=X z=BIG
  call GEM           ;|color
  fistp word[di+4-4] ; color -> pushed ax (exploit overflow-> 0x8000)
  popa

; 4-bit builtin gray palette with cheap dithering.
  shr ax,3
  add al,bl
  mov al,0xF0
  adc al,ah
  sbb al,0xE0  ; correct for overflow (16->15)

;; Faster version: draw each pixel twice.
;  stosb
;  add bx,0xCCCD ;dx:bx = YXX += 0000CCCD
;  adc dx,0

  stosb
  add bx,0xCCCD ;dx:bx = YXX += 0000CCCD
  jnc X2
  jnz X        ;do 65536 iterations

  inc cx       ; T++
  in al,60h
  dec al
  jnz M        ; fallthrough

GEM: ;Hit the gem. Front plane @ dx, back plane @ bp
  fild dword[si]     ;|tf=0 tb=HUGE=0xC40013 y x z
  fldz
  mov cx,[si]  ; i@cx = 19...1; bx points to p[i]
  lea bx,[bp+si]

;Ray-plane intersection.
I:                   ;|tf tb rd.y .x .z
;Dot product.
  add si,12    ;108 104 100
DP:add si,di   ;-4
  fld dword[bx+si]   ;|p[i].z ...
  fmul st5           ;|rd.z*p[i].z ...
  jpo DP             ;|(rd*p[i]).y .x .z tf tb rd.y .x .z
  faddp
  faddp              ;|D=(rd|p[i]) tf tb rd.y .x .z

  fst dword[bx+di]   ; -> p[i].dot_rd
  ftst
  fnstsw ax
  sahf         ; cf=1 if we're in front of the plane
  fld dword[bx+si+8] ;|(ro|p[i]) D tf tb rd.y .x .z       ; ro.z=-19
  fldlg2
  faddp              ;|N=pd-(ro|p[i]) D tf tb rd.y .x .z  ; pd=0.301
  fdivrp st1         ;|t=N/D tf tb rd.y .x .z
  jnc BACK
FRONT:
  fcom st1
  fnstsw ax
  sahf
  jb NEXT      ;if t>=tf { tf=t; pf@bp = current; }
  fst st1
  mov bp,bx
  jmp NEXT
BACK:
  fcom st2
  fnstsw ax
  sahf
  jnb NEXT     ;if t<tb { tb=t; }
  fst st2
NEXT:
  fstp st0           ;|tf tb rd.y .x .z
  fcom
  fnstsw ax
  sahf         ;if tf>=fb { no_hit: cf=0; early exit } else { cf=1 }
  jnb EXIT
  lea bx,[bx+si]; don't overwrite carry
  loop I
EXIT:
  fcompp             ;|rd.y .x .z (get rid of 'tf tb')
  jnc BGD

;Compute the reflection from the front plane. reflect(i,n) = i - 2*n*(i|n)
  add si,12    ;108 104 100
Y:add si,di   ;-4
  fld dword[bp+di]   ;|(rd|pb) rd.y .x .z
  fmul dword[bp+si]  ;|(rd|pb)*pb.z rd.y .x .z
  fadd st0           ;|2*(rd|pb)*pb.z rd.y .x .z
  fsubr st3          ;|R.z=rd.z-2*(rd|pb)*pb.z rd.y .x .z
  jpo Y              ;|(R=i-2*n(i|n)).y R.x R.z rd.y .x .z

;Environment map: checkerboard below, light above.
BGD:                 ;|y x z
  ftst
  fabs               ;|Y=abs(y) x z
  fnstsw ax
  sahf         ; if y>0 { checker } else { sky }
  jb SKY       ; the sky is just Y (= y^2 after gamma)
C:fidiv word[si]     ;|Y/16 x z
  fdiv st2,st0       
  fdiv st1,st0       
  fxch st2           ;|16*z/Y 16*x/Y Y/16
  fiadd word[di+4-6] ;|u=T+16*z/Y v=16*x/Y Y/16
  fistp word[di]
  mov ax,[di]
  fistp word[di]
  xor ax,[di]  ; ax = u xor v
  and ax,9
  add al,8     ; tex = ((u xor v) and 8) + 8
  mov [di],ax
  fimul word[di] ;|tex*Y/16
SKY:
  ret
